# Loading data
## Books
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Books/")
stats.books <- read.csv(file="dataset_stats_books.tsv",head=T,sep="\t")
books.nodes <- read.csv(file="books_nodes.tsv",head=T,sep="\t")
books.edges <- read.csv(file="books_edges.tsv",head=T,sep="\t")

## Wikipedia
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Wiki/")
stats.wiki <- read.csv(file="dataset_stats_wikipedia.tsv",head=T,sep="\t")
wiki.nodes <- read.csv(file="wikipedia_nodes.tsv",head=T,sep="\t")
wiki.edges <- read.csv(file="wikipedia_edges.tsv",head=T,sep="\t")

## Twitter
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Twitter/")
stats.twitter <- read.csv(file="dataset_stats_twitter.tsv",head=T,sep="\t")
twitter.nodes <- read.csv(file="twitter_gln_nodes.tsv",head=T,sep="\t")
twitter.edges <- read.csv(file="twitter_gln_links_with_over_expression.tsv",head=T,sep="\t")

## Utility
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Utility/")
gdp.country <- read.csv(file="gdp_pc_population_by_country.tsv",head=T,sep="\t")
gdp.language <- read.csv(file="gdp_pc_population_by_language.tsv",head=T,sep="\t")
language.conversion <- read.csv(file="language_conversion_table_iso639-3.tsv",head=T,sep="\t")
language.speakers <- read.csv(file="language_speakers_families_iso639-3.tsv",head=T,sep="\t")

## Famous
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Famous individuals/")
famous.murr.country <- read.csv(file="Famous_murray_by_country.tsv",head=T,sep="\t")
famous.murr.language <- read.csv(file="Famous_murray_by_language.tsv",head=T,sep="\t")
famous.wiki.country <- read.csv(file="Famous_wikipedia_by_country.tsv",head=T,sep="\t")
famous.wiki.language <- read.csv(file="Famous_wikipedia_by_language.tsv",head=T,sep="\t")

## Analysis
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Analysis/")
Analysis <- read.csv(file="centralities_by_language.tsv",head=T,sep="\t")
Analysis$language.name <- as.character(Analysis$language.name)
Analysis$language.name <- gsub(" \\(macrolanguage\\)", "", Analysis$language.name)

## Country names and codes
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/Countries/")
country <- read.csv(file="count_name_code.csv",head=T,sep=",")
country$Name <- as.character(country$Name)
country$Name[country$Name=="Russian Federation"] <- "Russia"

## GDELT
setwd("/Users/edmondawad/Dropbox (MIT)/Gov2001/Project/Languages datasets/gdelt/")
##MC
MC <- read.csv(file="out_qc2_agg10.csv",head=T,sep=",")
library(dplyr)

# Preprocessing
LangLink <- merge(MC,books.edges[,c(1,2,7)],by.x=c("from_lang","to_lang"),by.y=c("SourceLanguageCode","TargetLanguageCode"))
names(LangLink)[4] <- "books"
LangLink <- merge(LangLink,wiki.edges[,c(1,2,7)],by.x=c("from_lang","to_lang"),by.y=c("SourceLanguageCode","TargetLanguageCode"))
names(LangLink)[5] <- "wiki"
LangLink <- merge(LangLink,twitter.edges[,c(1,2,5)],by.x=c("from_lang","to_lang"),by.y=c("Source","Target"))
names(LangLink)[6] <- "twitter"

gdp.dict<-new.env()
pop.dict<-new.env()
for(i in seq(nrow(gdp.language)))
{  gdp.dict[[ as.character(gdp.language[i,1]) ]]<- gdp.language[i,2]
pop.dict[[ as.character(gdp.language[i,1]) ]]<- gdp.language[i,3]}

gdp1 <- rep(0,nrow(LangLink))
pop1 <- rep(0,nrow(LangLink))

gdp2 <- rep(0,nrow(LangLink))
pop2 <- rep(0,nrow(LangLink))

for(i in seq(nrow(LangLink)))
{  gdp1[i] <- gdp.dict[[as.character(LangLink$from_lang[i])]]
pop1[i] <- pop.dict[[as.character(LangLink$from_lang[i])]]
gdp2[i] <- gdp.dict[[as.character(LangLink$to_lang[i])]]
pop2[i] <- pop.dict[[as.character(LangLink$to_lang[i])]]
}
LangLink <- cbind(LangLink,gdp1,pop1,gdp2,pop2)
LangLink$pop1[LangLink$pop1 == 0] <- 0.0000001
LangLink$pop2[LangLink$pop2 == 0] <- 0.0000001

# Regressions
l1 <- lm(formula = log10(count_actions) ~ log10(pop1) +log10(gdp1)+log10(pop2) +log10(gdp2)+log10(pop1)*log10(pop2)+log10(gdp1)*log10(gdp2), data=LangLink)
l2 <- lm(formula = log10(count_actions) ~ log10(twitter), data=LangLink)
l3 <- lm(formula = log10(count_actions) ~ log10(wiki), data=LangLink)
l4 <- lm(formula = log10(count_actions) ~ log10(books), data=LangLink)
l5 <- lm(formula = log10(count_actions) ~ log10(pop1) + log10(gdp1) + log10(pop2) + log10(gdp2)+log10(pop1)*log10(pop2)+log10(gdp1)*log10(gdp2)+ log10(twitter), data=LangLink)
l6 <- lm(formula = log10(count_actions) ~ log10(pop1) + log10(gdp1)+ log10(pop2) + log10(gdp2)+log10(pop1)*log10(pop2)+log10(gdp1)*log10(gdp2)+log10(wiki), data=LangLink)
l7 <- lm(formula = log10(count_actions) ~ log10(pop1) + log10(gdp1)+ log10(pop2) + log10(gdp2)+log10(pop1)*log10(pop2)+log10(gdp1)*log10(gdp2)+log10(books), data=LangLink)

anova(l1,l5)
anova(l1,l6)
anova(l1,l7)

# Checking for the soundness of the models
N=nrow(LangLink)
fit <- l1
bread <-vcov(fit)
est.fun <- estfun(fit)
meat <- t(est.fun)%*%est.fun
robust <- sandwich(fit, meat=crossprod(est.fun)/N)
robustse <- sqrt(diag(robust))
robustse

library(gvlma)
fit <- l1
gvmodel <- gvlma(fit) 
summary(gvmodel)

# Output into a table
library(stargazer)
## This will produce an html file.
stargazer(l1, l2, l3, l4, l5, l6, l7, title="", font.size = "footnotesize",           
          align=TRUE, type="html", out="Fig5MC.html",
          covariate.labels = c("log<sub>10</sub>(Population1)","log<sub>10</sub>(GDP1)", "log<sub>10</sub>(Population2)","log<sub>10</sub>(GDP2)",
                               "Twitter", "Wikipedia","book trans.", "(intercept)"),
          dep.var.labels.include = FALSE, omit.stat = c("ser", "f"),
          dep.var.caption = "Amount of material cooperation actions taken between two <br> languages, based on actions between countries.",
          notes = "***,**,* significant at 0.1%, 1% and 5% levels, respectively. Standard errors in parentheses. <br> Only languages with at least one famous person are included.",
          notes.label = "", notes.append = FALSE, notes.align = "l")

stargazer(l1, l5, title="", font.size = "footnotesize",           
          align=TRUE, type="html", out="Fig5A.html",
          covariate.labels = c("log<sub>10</sub>(Population1)","log<sub>10</sub>(GDP1)", 
                               "log<sub>10</sub>(Population2)","log<sub>10</sub>(GDP2)",
                               "Twitter",
                               "log<sub>10</sub>(Population1)*log<sub>10</sub>(Population2)",
                               "log<sub>10</sub>(GDP1)*log<sub>10</sub>(GDP2)",
                               "(intercept)"),
          dep.var.labels.include = FALSE, omit.stat = c("ser", "f"),
          dep.var.caption = "Amount of material cooperation actions taken between two <br> languages, based on actions between countries.",
          notes = "***,**,* significant at 0.1%, 1% and 5% levels, respectively. Standard errors in parentheses.",
          notes.label = "", notes.append = FALSE, notes.align = "l")


fitted.all <- data.frame(cbind(l1$fitted.values,l2$fitted.values,l3$fitted.values,l4$fitted.values,
                               l5$fitted.values,l6$fitted.values,l7$fitted.values))
colnames(fitted.all) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7");
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}
my_line <- function(x,y,...){
  points(x,y,...)
  abline(a = lm(y ~ x)$coefficients[1] , b = lm(y ~ x)$coefficients[2] , ...)
}
pdf(file="FigpairsMC.pdf", width = 12, height = 6)
pairs(fitted.all,lower.panel = panel.cor, upper.panel=my_line)  
dev.off()


## Causal Inference Analysis

### Alternative models -- Descritizing the persistence.
#LangLink$twitter.is.persistent <- NULL

twitter.is.persistent <- log10(LangLink$twitter)>3
twitter.is.persistent <- factor(twitter.is.persistent,labels=c("not persistent","persistent"))
LangLink <- cbind(LangLink,twitter.is.persistent)

wikipedia.is.persistent <- log10(LangLink$wiki)>3
wikipedia.is.persistent <- factor(wikipedia.is.persistent,labels=c("not persistent","persistent"))
LangLink <- cbind(LangLink,wikipedia.is.persistent)

books.is.persistent <- log10(LangLink$books)>3
books.is.persistent <- factor(books.is.persistent,labels=c("not persistent","persistent"))
LangLink <- cbind(LangLink,books.is.persistent)

## Modified models
l5.mod <- lm(formula = log10(count_actions) ~ log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ twitter.is.persistent, data=LangLink)
l6.mod <- lm(formula = log10(count_actions) ~ log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ wikipedia.is.persistent, data=LangLink)
l7.mod <- lm(formula = log10(count_actions) ~ log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ books.is.persistent, data=LangLink)

## How correlated were the previous models within themselves?
plot(l5$fitted.values,l6$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l6$fitted.values~l5$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5$fitted.values,l6$fitted.value)

plot(l5$fitted.values,l7$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7$fitted.values~l5$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5$fitted.values,l7$fitted.value)

plot(l6$fitted.values,l7$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7$fitted.values~l6$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l6$fitted.values,l7$fitted.value)

## How correlated are the new models to the corresponding previous models?
plot(l5.mod$fitted.values,l5$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l5$fitted.values~l5.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5.mod$fitted.values,l5$fitted.value)

plot(l6.mod$fitted.values,l6$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l6$fitted.values~l6.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l6.mod$fitted.values,l6$fitted.value)

plot(l7.mod$fitted.values,l7$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7$fitted.values~l7.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l7.mod$fitted.values,l7$fitted.value)

## How correlated are the modified models within themselves? -- In-Sample Fit Chart
plot(l5.mod$fitted.values,l6.mod$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l6.mod$fitted.values~l5.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5.mod$fitted.values,l6.mod$fitted.value)

plot(l5.mod$fitted.values,l7.mod$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7.mod$fitted.values~l5.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l5.mod$fitted.values,l7.mod$fitted.value)

plot(l6.mod$fitted.values,l7.mod$fitted.values)
abline(a=0,b=1,lty=4)
adjr2 = round(summary(lm(l7.mod$fitted.values~l6.mod$fitted.values))$adj.r.squared,2)
legend("bottomright", bty="n", legend=paste("R2 =", format(adjr2*100),"%"))
cor(l6.mod$fitted.values,l7.mod$fitted.value)

# Convex Hull -- Checking for extrapolation bias
library(Zelig)
library(WhatIf)
# Twitter
## Hull 1: Switch 1 to 0
a<-whatif(~log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ twitter.is.persistent, 
          data=LangLink[LangLink$twitter.is.persistent=="not persistent",], 
          cfact=LangLink[LangLink$twitter.is.persistent=="persistent",]) 
summary(a)

## Hull 2: Switch 0 to 1
b<-whatif(~log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ twitter.is.persistent,  
          data=LangLink[LangLink$twitter.is.persistent=="persistent",], 
          cfact=LangLink[LangLink$twitter.is.persistent=="not persistent",]) 
summary(b)

# Wikipedia
## Hull 1: Switch 1 to 0
a<-whatif(~log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ wikipedia.is.persistent, 
          data=LangLink[LangLink$wikipedia.is.persistent=="not persistent",], 
          cfact=LangLink[LangLink$wikipedia.is.persistent=="persistent",]) 
summary(a)

## Hull 2: Switch 0 to 1
b<-whatif(~log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ books.is.persistent,  
          data=LangLink[LangLink$books.is.persistent=="persistent",], 
          cfact=LangLink[LangLink$books.is.persistent=="not persistent",]) 
summary(b)

# Books
## Hull 1: Switch 1 to 0
a<-whatif(~log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ books.is.persistent, 
          data=LangLink[LangLink$books.is.persistent=="not persistent",], 
          cfact=LangLink[LangLink$books.is.persistent=="persistent",]) 
summary(a)

## Hull 2: Switch 0 to 1
b<-whatif(~log10(pop1)+log10(pop2) + log10(gdp1)+log10(gdp2)+ books.is.persistent,  
          data=LangLink[LangLink$books.is.persistent=="persistent",], 
          cfact=LangLink[LangLink$books.is.persistent=="not persistent",]) 
summary(b)



# Measuring imbalance
library(cem)
imbalance.t <- imbalance(group=LangLink$twitter.is.persistent, data=LangLink[,seq(7,10)])
imbalance.t
imbalance.w <- imbalance(group=LangLink$wikipedia.is.persistent, data=LangLink[,seq(7,10)])
imbalance.w
imbalance.b <- imbalance(group=LangLink$books.is.persistent, data=LangLink[,seq(7,10)])
imbalance.b

# Matching
library(rgenoud)
library(Matching)
library(MatchIt)

### Twitter
## Using CEM 
cem.match <- cem(treatment="twitter.is.persistent",
                 data=LangLink[,c(7,8,9,10,11)])
data_cem3 <- LangLink[cem.match$matched ,]
post.imbalance.cem3 <- imbalance(group=data_cem3$twitter.is.persistent, data=data_cem3[,c(7,8,9,10)])
post.imbalance.cem3

matched.t <- data_cem3

treated.mean.t <- mean(matched.t$count_actions[matched.t$twitter.is.persistent=="persistent"])
controlled.mean.t <- mean(matched.t$count_actions[matched.t$twitter.is.persistent=="not persistent"])

treated.mean.t - controlled.mean.t

(treated.mean.t - controlled.mean.t)/controlled.mean.t

treated.prob.t <- treated.mean.t/min(mean(matched.t$pop1[matched.t$twitter.is.persistent=="persistent"]),
                                     mean(matched.t$pop2[matched.t$twitter.is.persistent=="persistent"]))
controlled.prob.t <- controlled.mean.t/min(mean(matched.t$pop1[matched.t$twitter.is.persistent=="not persistent"]),
                                           mean(matched.t$pop2[matched.t$twitter.is.persistent=="not persistent"]))

(treated.prob.t - controlled.prob.t)/controlled.prob.t

l5 <- lm(formula = log10(count_actions) ~ log10(pop1) + log10(gdp1) + log10(pop2) + log10(gdp2)+log10(pop1)*log10(pop2)+log10(gdp1)*log10(gdp2)+ log10(twitter), data=matched.t)

### Wikipedia
## Using CEM 
cem.match <- cem(treatment="wikipedia.is.persistent",
                 data=LangLink[,c(7,8,9,10,12)])
data_cem3 <- LangLink[cem.match$matched ,]
post.imbalance.cem3 <- imbalance(group=data_cem3$wikipedia.is.persistent, data=data_cem3[,c(7,8,9,10)])
post.imbalance.cem3

matched.w <- data_cem3
treated.mean.w <- mean(matched.w$count_actions[matched.w$wikipedia.is.persistent=="persistent"])
controlled.mean.w <- mean(matched.w$count_actions[matched.w$wikipedia.is.persistent=="not persistent"])

treated.mean.w - controlled.mean.w

treated.prob.w <- treated.mean.w/min(mean(matched.t$pop1[matched.t$wikipedia.is.persistent=="persistent"]),
                                     mean(matched.t$pop2[matched.t$wikipedia.is.persistent=="persistent"]))
controlled.prob.w <- controlled.mean.w/min(mean(matched.t$pop1[matched.t$wikipedia.is.persistent=="not persistent"]),
                                           mean(matched.t$pop2[matched.t$wikipedia.is.persistent=="not persistent"]))
(treated.prob.w - controlled.prob.w)/controlled.prob.w



# For books
## Using CEM 
cem.match <- cem(treatment="books.is.persistent",
                 data=LangLink[,c(7,8,9,10,13)])
data_cem3 <- LangLink[cem.match$matched ,]
post.imbalance.cem3 <- imbalance(group=data_cem3$books.is.persistent, data=data_cem3[,c(7,8,9,10)])
post.imbalance.cem3

matched.b <- data_cem3
treated.mean.b <- mean(matched.b$count_actions[matched.b$books.is.persistent=="persistent"])
controlled.mean.b <- mean(matched.b$count_actions[matched.b$books.is.persistent=="not persistent"])

treated.mean.b - controlled.mean.b

treated.prob.b <- treated.mean.b/min(mean(matched.t$pop1[matched.t$books.is.persistent=="persistent"]),
                                     mean(matched.t$pop2[matched.t$books.is.persistent=="persistent"]))
controlled.prob.b <- controlled.mean.b/min(mean(matched.t$pop1[matched.t$books.is.persistent=="not persistent"]),
                                           mean(matched.t$pop2[matched.t$books.is.persistent=="not persistent"]))
(treated.prob.b - controlled.prob.b)/controlled.prob.b






